Import the clinical data and the gene counts.
baseFolder <- "/savona/nobackup/biostat/datasets/heartMouse/"
sampleInfo <- read.csv(file.path(baseFolder, "mouseDietInfo.csv"))
countsGenes <- read.delim(file.path(baseFolder, "unnormalisedGeneCounts.txt"))
sampleInfo <- sampleInfo[match(colnames(countsGenes), sampleInfo[, "Sample"]), ]
sampleInfo[, "Diet"] <- factor(sampleInfo[, "Diet"], levels = c("Chow", "Glucose", "Sucrose"))
Draw an MDS plot.
library(edgeR)
## Loading required package: limma
dataset <- DGEList(countsGenes, group = sampleInfo[, "Diet"])
dataset <- calcNormFactors(dataset)
suppressPackageStartupMessages(library(plotly))
MDScoordinates <- plotMDS(dataset, plot = FALSE)
colours <- setNames(c("green", "blue", "red"), c("Chow", "Glucose", "Sucrose"))
plotData <- data.frame(sample = names(MDScoordinates[['x']]), x = MDScoordinates[['x']], y = MDScoordinates[['y']],
condition = sampleInfo[, "Diet"])
plot_ly(data = plotData, x = ~x, y = ~y, color = ~condition, text = ~sample, colors = colours, type = "scatter", mode = "markers") %>% layout(xaxis = list(zeroline = FALSE, title = "Leading logFC Dimension 1"), yaxis = list(zeroline = FALSE, title = "Leading logFC Dimension 2"))
The samples are decently mixed together.
Fit a model to all of the data and test the contrasts Glucose - Chow, Sucrose - Chow, Sucrose - Glucose.
keep <- filterByExpr(dataset, min.total.count = 200)
dataset <- dataset[keep, , keep.lib.sizes = FALSE]
design <- model.matrix(~ -1 + Diet, data = sampleInfo)
dataset <- estimateDisp(dataset, design, robust = TRUE)
fit <- glmQLFit(dataset, design)
geneTestGlucoseVsChow <- glmQLFTest(fit, contrast = c(-1, 1, 0))
geneTestSucroseVsChow <- glmQLFTest(fit, contrast = c(-1, 0, 1))
geneTestSucroseVsGlucose <- glmQLFTest(fit, contrast = c(0, -1, 1))
The significant genes for glucose vs. chow.
topTags(geneTestGlucoseVsChow, p.value = 0.01, n = Inf)
## Coefficient: -1*DietChow 1*DietGlucose
## logFC logCPM F PValue FDR
## Reln -0.5253951 4.4617076 59.15985 3.886952e-09 6.331068e-05
## Gm11627 0.4746246 2.1155585 35.55801 7.503845e-07 4.540332e-03
## Csrp2 0.4071251 6.7025345 34.57466 9.700616e-07 4.540332e-03
## Kcnd2 -0.4659080 2.8285563 34.04226 1.116488e-06 4.540332e-03
## Jsrp1 0.5868902 1.0272320 33.21074 1.393766e-06 4.540332e-03
## Arhgdib 0.3293382 4.7360521 31.27650 2.360792e-06 6.408763e-03
## Gm43359 -0.8651849 2.1667341 30.34816 3.057542e-06 6.863207e-03
## Gm43283 -1.7764294 -1.8201914 29.84141 3.526811e-06 6.863207e-03
## Gm42478 -1.2496504 -0.1138566 29.58539 3.792293e-06 6.863207e-03
## Gm43792 -1.5765900 -1.6757454 29.15706 4.284784e-06 6.979056e-03
## Gm38082 -0.9701733 0.7790731 27.58781 6.751534e-06 9.997181e-03
The significant genes for sucrose vs. chow.
topTags(geneTestSucroseVsChow, p.value = 0.01, n = Inf)
## Coefficient: -1*DietChow 1*DietSucrose
## logFC logCPM F PValue FDR
## Asns 0.7151135 3.25809040 48.92262 3.162545e-08 0.0004581015
## Reln -0.5757348 4.46170759 46.31325 5.625019e-08 0.0004581015
## Xist 7.1728617 4.81891570 42.87374 1.237480e-07 0.0006373641
## Tnfrsf9 1.4448659 1.89668518 41.87769 1.565236e-07 0.0006373641
## Mthfd2 0.4353115 4.01295445 39.86967 2.537806e-07 0.0008267157
## Muc1 3.1122469 0.09375315 32.64177 1.624835e-06 0.0044108859
## Ager 3.1268307 0.27260239 30.73876 2.741037e-06 0.0063780018
## Paqr9 0.4922399 3.42838443 29.64796 3.725521e-06 0.0075851601
## Cacng6 -0.7304731 2.68840022 29.09969 4.355709e-06 0.0078828645
Xist has a remarkably high log-fold change. It is an X chromosome inactivating lncRNA. Plots its FPKM values per treatment group.
library(ggplot2)
theme_set(theme_bw())
geneFPKMs <- read.delim(file.path(baseFolder, "geneFPKMs.txt"))
plotData <- data.frame(Xist = unlist(geneFPKMs["Xist", ]),
group = sampleInfo[match(colnames(geneFPKMs), sampleInfo[, "Sample"]), "Diet"])
ggplot(plotData, aes(y = Xist, x = group, fill = group)) + geom_dotplot(binaxis = 'y', stackdir = "center") + labs(x = "Diet", y = "Xist FPKM", fill = "Diet") + ggtitle("Relationship Between Xist and Diet") + theme(plot.title = element_text(hjust = 0.5), axis.text = element_text(colour = "black"))
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
Two mice are exaggerating the fold-change, but it could be an interesting sub-group nonetheless.
subset(plotData, Xist > 1)
## Xist group
## GC222 38.26 Sucrose
## SL44 39.66 Sucrose
The two mice are also outliers from other mice on the same diet in the MDS plot above.
The significant genes for sucrose vs. glucose.
topTags(geneTestSucroseVsGlucose, p.value = 0.01, n = Inf)
## Coefficient: -1*DietGlucose 1*DietSucrose
## logFC logCPM F PValue FDR
## Xist 9.030956 4.8189157 45.54416 6.689491e-08 0.001089584
## Ager 3.506059 0.2726024 32.91812 1.507926e-06 0.009523843
## Cbr2 4.834769 1.6214920 32.35958 1.754146e-06 0.009523843
Again, Xist has a large fold change, with the reason shown in the dot plot above.
Gene set testing using CAMERA.
suppressPackageStartupMessages(library(org.Mm.eg.db))
entrezIDs <- mapIds(org.Mm.eg.db, keys = rownames(dataset), keytype = "SYMBOL", column = "ENTREZID")
## 'select()' returned 1:1 mapping between keys and columns
genesToPathways <- getGeneKEGGLinks("mmu")
pathwayIDsToNames <- getKEGGPathwayNames("mmu", remove.qualifier = TRUE)
genesToPathways[, 2] <- pathwayIDsToNames[match(genesToPathways[, 2], pathwayIDsToNames[, 1]), 2]
entrezPathwaysList <- split(genesToPathways[, 1], genesToPathways[, 2])
indexList <- lapply(entrezPathwaysList, function(pathwayGenes) na.omit(match(pathwayGenes, entrezIDs)))
genesToPathways[, 1] <- rownames(dataset)[match(genesToPathways[, 1], entrezIDs)]
symbolsPathwaysList <- split(genesToPathways[, 1], genesToPathways[, 2])
symbolsPathwaysList <- sapply(symbolsPathwaysList, na.omit)
enrichmentResultsSucrose <- camera(dataset, indexList, contrast = c(-1, 0, 1))
enrichmentResultsSucroseVsGlucose <- camera(dataset, indexList, contrast = c(0, -1, 1))
Gene set test for glucose vs. chow.
head(camera(dataset, indexList, contrast = c(-1, 1, 0)), 10)
## NGenes Direction PValue FDR
## Ribosome 132 Up 1.118049e-41 3.779006e-39
## Oxidative phosphorylation 110 Up 1.205143e-18 2.036691e-16
## Parkinson disease 220 Up 1.648031e-11 1.856782e-09
## Coronavirus disease - COVID-19 189 Up 4.263024e-11 3.602255e-09
## Thermogenesis 201 Up 4.020325e-10 2.717740e-08
## Prion disease 234 Up 5.813549e-10 3.274966e-08
## Huntington disease 264 Up 2.802881e-09 1.353391e-07
## Non-alcoholic fatty liver disease 133 Up 5.225070e-09 2.207592e-07
## Amyotrophic lateral sclerosis 313 Up 1.490102e-08 5.596162e-07
## Proteasome 45 Up 6.059786e-08 2.048208e-06
The top pathways list appears to have many neurodegenerative-themed gene sets.
Gene set test for sucrose vs. chow.
head(camera(dataset, indexList, contrast = c(-1, 0, 1)), 10)
## NGenes Direction PValue
## Biosynthesis of amino acids 59 Up 2.645826e-06
## Glycolysis / Gluconeogenesis 46 Up 6.374198e-05
## Protein processing in endoplasmic reticulum 157 Up 1.531272e-04
## Aminoacyl-tRNA biosynthesis 44 Up 5.709906e-04
## Glutathione metabolism 50 Up 1.421423e-03
## Ribosome 132 Up 1.564870e-03
## Fluid shear stress and atherosclerosis 125 Up 1.864434e-03
## Fructose and mannose metabolism 29 Up 2.643509e-03
## Amino sugar and nucleotide sugar metabolism 45 Up 3.077129e-03
## Metabolism of xenobiotics by cytochrome P450 33 Up 4.133014e-03
## FDR
## Biosynthesis of amino acids 0.0008942893
## Glycolysis / Gluconeogenesis 0.0107723938
## Protein processing in endoplasmic reticulum 0.0172523346
## Aminoacyl-tRNA biosynthesis 0.0482487037
## Glutathione metabolism 0.0881543194
## Ribosome 0.0881543194
## Fluid shear stress and atherosclerosis 0.0900255135
## Fructose and mannose metabolism 0.1116882460
## Amino sugar and nucleotide sugar metabolism 0.1155632846
## Metabolism of xenobiotics by cytochrome P450 0.1318120881
The top pathways list appears to have many metabolism-themed gene sets. Visualise the Glycolysis / Gluconeogenesis pathway.
barcodeplot(geneTestSucroseVsChow[["table"]][["logFC"]], index = indexList[["Glycolysis / Gluconeogenesis"]], main = "Glycolysis / Gluconeogenesis")
The fold changes of most genes are tiny.
Gene set test for sucrose vs. glucose.
head(camera(dataset, indexList, contrast = c(0, -1, 1)), 10)
## NGenes Direction PValue FDR
## Ribosome 132 Down 5.065390e-32 1.712102e-29
## Oxidative phosphorylation 110 Down 6.446049e-18 1.089382e-15
## Parkinson disease 220 Down 7.451288e-10 8.395118e-08
## Systemic lupus erythematosus 96 Down 2.751606e-09 2.325107e-07
## Thermogenesis 201 Down 9.322862e-08 6.302255e-06
## Non-alcoholic fatty liver disease 133 Down 1.637640e-07 9.225371e-06
## Prion disease 234 Down 3.284426e-07 1.585908e-05
## Coronavirus disease - COVID-19 189 Down 9.854278e-07 4.163432e-05
## Huntington disease 264 Down 1.859282e-06 6.982637e-05
## Diabetic cardiomyopathy 182 Down 3.398315e-06 1.104084e-04
The top pathways list appears to have many neurodegenerative-themed gene sets.